/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $

 ********************************************************************/

#include "codec_internal.h"


static const unsigned __int64 V128 = 0x8080808080808080LL;

static void copy8x8__mmx (unsigned char *src,
	                unsigned char *dest,
	                unsigned int stride)
{

    //Is this even the fastest way to do this?
    __asm {
        align 16        

        mov         eax, src
        mov         ebx, dest
        mov         ecx, stride

        lea		    edi, [ecx + ecx * 2]
        movq		mm0, [eax]
        movq		mm1, [eax + ecx]
        movq		mm2, [eax + ecx * 2]
        movq		mm3, [eax + edi]
        lea		    eax, [eax + ecx * 4]
        movq		[ebx], mm0
        movq		[ebx + ecx], mm1
        movq		[ebx + ecx * 2], mm2
        movq		[ebx + edi], mm3
        lea		    ebx, [ebx + ecx * 4]
        movq		mm0, [eax]
        movq		mm1, [eax + ecx]
        movq		mm2, [eax + ecx * 2]
        movq		mm3, [eax + edi]
        movq		[ebx], mm0
        movq		[ebx + ecx], mm1
        movq		[ebx + ecx * 2], mm2
        movq		[ebx + edi], mm3

    };

}

static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
		      ogg_uint32_t LineStep)
{

    __asm {
        align 16

        mov         eax, ReconPtr
        mov         ebx, ChangePtr
        mov         ecx, LineStep

        movq		mm0, V128

        lea		    edi, [128 + ebx]
    loop_start:	
        movq		mm2, [ebx]

        packsswb	mm2, [8 + ebx]
        por		    mm0, mm0
        pxor		mm2, mm0
        lea		    ebx, [16 + ebx]
        cmp		    ebx, edi

        movq		[eax], mm2



        lea		    eax, [eax + ecx]
        jc		    loop_start


    };
    
}





static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
		      ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{

    __asm {

        align 16

        mov         eax, ReconPtr
        mov         ebx, ChangePtr
        mov         ecx, LineStep
        mov         edx, RefPtr
    
        pxor		mm0, mm0
        lea		    edi, [128 + ebx]

    loop_start:
        movq		mm2, [edx]

        movq		mm4, [ebx]
        movq		mm3, mm2
        movq		mm5, [8 + ebx]
        punpcklbw	mm2, mm0
        paddsw		mm2, mm4
        punpckhbw	mm3, mm0
        paddsw		mm3, mm5
        add		    edx, ecx
        packuswb	mm2, mm3
        lea		    ebx, [16 + ebx]
        cmp		    ebx, edi

        movq		[eax], mm2

        lea		    eax, [eax + ecx]
        jc		    loop_start

    };
}




static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
		           unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
			   ogg_uint32_t LineStep)
{
    __asm {
        align 16

        mov     eax, ReconPtr
        mov     ebx, ChangePtr
        mov     ecx, RefPtr1
        mov     edx, RefPtr2
                
        pxor		mm0, mm0
        lea		edi, [128 + ebx]

    loop_start:
        movq		mm2, [ecx]
        movq		mm4, [edx]

        movq		mm3, mm2
        punpcklbw		mm2, mm0
        movq		mm5, mm4
        movq		mm6, [ebx]
        punpckhbw		mm3, mm0
        movq		mm7, [8 + ebx]
        punpcklbw		mm4, mm0
        punpckhbw		mm5, mm0
        paddw		mm2, mm4
        paddw		mm3, mm5
        psrlw		mm2, 1
        psrlw		mm3, 1
        paddw		mm2, mm6
        paddw		mm3, mm7
        lea		ebx, [16 + ebx]
        packuswb		mm2, mm3
        add		ecx, LineStep
        add		edx, LineStep
        movq		[eax], mm2
        add		eax, LineStep
        cmp		ebx, edi
        jc		loop_start

    };

}




void dsp_mmx_recon_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");
  funcs->copy8x8 = copy8x8__mmx;
  funcs->recon_intra8x8 = recon_intra8x8__mmx;
  funcs->recon_inter8x8 = recon_inter8x8__mmx;
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
}

